Clean up the Data

library(readr)
library(dplyr)
library(stringr)

df <- read.csv('raw_data/coffee_compare.csv')
coffee <- df %>% select(DBA, reinspections, checks, violations, 
                        score, inspections, BORO, SCORE)
coffee$DBA = ifelse(str_detect(coffee$DBA, "DUNKIN"), 'DD', 'Starbucks')
coffee %>% 
    select(BORO, DBA) %>% 
    group_by(BORO) %>% 
    table()
##                DBA
## BORO             DD Starbucks
##   BRONX          92        11
##   BROOKLYN      139        44
##   MANHATTAN     170       242
##   QUEENS        188        42
##   STATEN ISLAND  36         7

Visualization

library(ggthemes)
library(ggplot2)
library(plotly)

coffee_new <- coffee %>% 
    group_by(DBA, BORO) %>% 
    summarize(Value = n())

pc <- ggplot(coffee_new, aes(fill=DBA, y=Value, x=BORO)) + 
    geom_bar(position="dodge", stat="identity", width = 0.5) +
    xlab('Neighborhood') + 
    ylab('Health Violations') + 
    labs(caption = 'Data Source: DOHMH',
         fill = 'Brands') +
    ggtitle('Health Violations of Coffee Brands by Neighborhoods') +
    theme_bw() +
    scale_fill_manual(values = c("#f09a56", "#87dc97")) +
    theme(plot.title = element_text(size=12, face="bold", hjust = 0.5),
          legend.text = element_text(size=8),
          legend.title = element_text(size=8))
pc

pc1 <- ggplot(coffee, aes(x = BORO, y = violations, color = DBA)) + 
    geom_point(alpha = 0.5) +
    xlab('Neighborhood') + 
    ylab('Health Violations') + 
    labs(caption = 'DOHMH') +
    ggtitle('Health Violations of Coffee Brands by Neighborhoods') +
    theme_bw() +
    theme(plot.title = element_text(size=12, face="bold", hjust = 0.5),
          legend.text = element_text(size=8),
          legend.title = element_text(size=8)) +
    scale_color_manual(values = c("DD" = "#f09a56", 
                                  'Starbucks' = "#87dc97"))
ggplotly(pc1)
coffee_new <- coffee %>% 
    group_by(DBA, BORO) %>% 
    summarize(total_vio = sum(violations, na.rm = T))

pc11 <- ggplot(coffee_new, 
               aes(fill=DBA, y=total_vio, x=BORO)) + 
    geom_bar(position="dodge", stat="identity", width = 0.5) +
    xlab('Neighborhood') + 
    ylab('Health Violations') + 
    labs(caption = 'Data Source: DOHMH',
         fill = 'Brands') +
    ggtitle('Health Violations of Coffee Brands by Neighborhoods') +
    theme_bw() +
    scale_fill_manual(values = c("#f09a56", "#87dc97")) +
    theme(plot.title = element_text(size=12, face="bold", hjust = 0.5),
          legend.text = element_text(size=8),
          legend.title = element_text(size=8))
pc11

pc2 <- ggplot(coffee, aes(x = BORO, y = SCORE, color = DBA)) + 
    geom_point(alpha = 0.5) +
    xlab('Neighborhood') + 
    ylab('Total Score for a Particular Inspection') + 
    labs(caption = 'DOHMH') +
    ggtitle('Score of Coffee Brands by Neighborhoods') +
    theme_bw() +
    theme(plot.title = element_text(size=12, face="bold", hjust = 0.5),
          legend.text = element_text(size=8),
          legend.title = element_text(size=8)) +
    scale_color_manual(values = c("DD" = "#f09a56", 
                                  'Starbucks' = "#87dc97"))
ggplotly(pc2)
coffee_new <- coffee %>% 
    group_by(DBA, BORO) %>% 
    summarize(total_score = sum(SCORE, na.rm = T))

pc11 <- ggplot(coffee_new, 
               aes(fill=DBA, y=total_score, x=BORO)) + 
    geom_bar(position="dodge", stat="identity", width = 0.5) +
    xlab('Neighborhood') + 
    ylab('Total Score for a Particular Inspection') + 
    labs(caption = 'Data Source: DOHMH',
         fill = 'Brands') +
    ggtitle('Inspection Score of Coffee Brands by Neighborhoods') +
    theme_bw() +
    scale_fill_manual(values = c("#f09a56", "#87dc97")) +
    theme(plot.title = element_text(size=12, face="bold", hjust = 0.5),
          legend.text = element_text(size=8),
          legend.title = element_text(size=8))
pc11

Linear Regression Analysis

coffee_lm <- coffee %>% 
    group_by(DBA, BORO) %>% 
    summarize(total_num = n(),
              total_vio = sum(violations, na.rm = T),
              total_score = sum(SCORE, na.rm = T))
coffee_lm$DBA = ifelse(coffee_lm$DBA == "DD", 1, 0)

coffee_sb <- coffee_lm %>% filter(DBA == 1)
coffee_dd <- coffee_lm %>% filter(DBA == 0)
lm <- lm(total_num ~ total_vio + total_score,
         data = coffee_sb)
summary(lm)
## 
## Call:
## lm(formula = total_num ~ total_vio + total_score, data = coffee_sb)
## 
## Residuals:
##       1       2       3       4       5 
## -3.7532  1.8681  7.7430 -5.9649  0.1071 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6.04674    8.97407  -0.674    0.570
## total_vio    0.19675    0.09361   2.102    0.170
## total_score -0.10594    0.11825  -0.896    0.465
## 
## Residual standard error: 7.521 on 2 degrees of freedom
## Multiple R-squared:  0.9926, Adjusted R-squared:  0.9851 
## F-statistic: 133.4 on 2 and 2 DF,  p-value: 0.007442
lm <- lm(total_num ~ total_vio + total_score,
         data = coffee_dd)
summary(lm)
## 
## Call:
## lm(formula = total_num ~ total_vio + total_score, data = coffee_dd)
## 
## Residuals:
##       1       2       3       4       5 
##  2.9596  0.4643  0.2344 -1.9530 -1.7052 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)  1.13450    1.62027   0.700    0.556
## total_vio    0.14931    0.09474   1.576    0.256
## total_score -0.01807    0.10081  -0.179    0.874
## 
## Residual standard error: 2.806 on 2 degrees of freedom
## Multiple R-squared:  0.9996, Adjusted R-squared:  0.9992 
## F-statistic:  2443 on 2 and 2 DF,  p-value: 0.0004092
lm1 <- lm(total_vio ~ total_num,
         data = coffee_sb)
summary(lm1)
## 
## Call:
## lm(formula = total_vio ~ total_num, data = coffee_sb)
## 
## Residuals:
##      1      2      3      4      5 
##  69.29 -51.87 -44.06  48.47 -21.82 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  78.8885    70.8358   1.114 0.346613    
## total_num     8.7481     0.5185  16.872 0.000453 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 63.92 on 3 degrees of freedom
## Multiple R-squared:  0.9896, Adjusted R-squared:  0.9861 
## F-statistic: 284.7 on 1 and 3 DF,  p-value: 0.0004534
lm1 <- lm(total_vio ~ total_num,
         data = coffee_dd)
summary(lm1)
## 
## Call:
## lm(formula = total_vio ~ total_num, data = coffee_dd)
## 
## Residuals:
##       1       2       3       4       5 
## -21.165  -6.443  -1.107  16.665  12.050 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -7.92727    9.93968  -0.798    0.483    
## total_num    7.55386    0.08895  84.922  3.6e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 17.45 on 3 degrees of freedom
## Multiple R-squared:  0.9996, Adjusted R-squared:  0.9994 
## F-statistic:  7212 on 1 and 3 DF,  p-value: 3.599e-06
lm2 <- lm(total_vio ~ total_num*DBA,
         data = coffee_lm)
summary(lm2)
## 
## Call:
## lm(formula = total_vio ~ total_num * DBA, data = coffee_lm)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -51.873 -21.656  -3.775  15.511  69.287 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    -7.9273    26.6871  -0.297   0.7764    
## total_num       7.5539     0.2388  31.630 6.64e-08 ***
## DBA            86.8158    58.3784   1.487   0.1875    
## total_num:DBA   1.1942     0.4489   2.661   0.0375 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 46.85 on 6 degrees of freedom
## Multiple R-squared:  0.997,  Adjusted R-squared:  0.9956 
## F-statistic: 674.2 on 3 and 6 DF,  p-value: 5.653e-08
lm2 <- lm(total_score ~ total_num*DBA,
         data = coffee_lm)
summary(lm2)
## 
## Call:
## lm(formula = total_score ~ total_num * DBA, data = coffee_lm)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -79.691 -23.202  -4.934  23.618  95.573 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    -4.2836    32.3629  -0.132    0.899    
## total_num       7.0966     0.2896  24.503 3.04e-07 ***
## DBA            84.9386    70.7942   1.200    0.275    
## total_num:DBA  -0.2186     0.5443  -0.402    0.702    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 56.82 on 6 degrees of freedom
## Multiple R-squared:  0.9939, Adjusted R-squared:  0.9909 
## F-statistic: 327.5 on 3 and 6 DF,  p-value: 4.881e-07

Supervised Machine Learning

Question: Whether can be told the coffee store is Starbucks or not?

Binary outcome in this case.

library(caret)

coffee$DBA = ifelse(coffee$DBA == "DD", 0, 1)
coffee$DBA <- factor(coffee$DBA, 
                     labels = c("Starbucks", "DD"), 
                     levels = 1:0) 
set.seed(12345)
in_train <- createDataPartition(y = coffee$DBA, 
                                p = 0.8, list = FALSE)
training <- coffee[ in_train, ]
testing  <- coffee[-in_train, ]
logit <- glm(DBA ~ checks + violations + score + BORO, 
             data = training, family = binomial(link = "logit"))

y_hat_logit <- predict(logit, newdata = testing, type = "response")
z_logit <- factor(y_hat_logit > 0.5, 
                  levels = c(TRUE, FALSE), 
                  labels = c("Starbucks", "DD"))

confusionMatrix(z_logit, reference = testing$DBA)
## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  Starbucks  DD
##   Starbucks        34 101
##   DD               35  24
##                                           
##                Accuracy : 0.299           
##                  95% CI : (0.2355, 0.3687)
##     No Information Rate : 0.6443          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : -0.2596         
##                                           
##  Mcnemar's Test P-Value : 2.494e-08       
##                                           
##             Sensitivity : 0.4928          
##             Specificity : 0.1920          
##          Pos Pred Value : 0.2519          
##          Neg Pred Value : 0.4068          
##              Prevalence : 0.3557          
##          Detection Rate : 0.1753          
##    Detection Prevalence : 0.6959          
##       Balanced Accuracy : 0.3424          
##                                           
##        'Positive' Class : Starbucks       
## 
LDA <- train(DBA ~ checks + violations + score + BORO, 
             data = training, method = "lda", 
             reProcess = c("center", "scale"))
z <- predict(LDA, newdata = testing)
confusionMatrix(z, testing$DBA)
## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  Starbucks DD
##   Starbucks        38 26
##   DD               31 99
##                                           
##                Accuracy : 0.7062          
##                  95% CI : (0.6367, 0.7693)
##     No Information Rate : 0.6443          
##     P-Value [Acc > NIR] : 0.04083         
##                                           
##                   Kappa : 0.3484          
##                                           
##  Mcnemar's Test P-Value : 0.59624         
##                                           
##             Sensitivity : 0.5507          
##             Specificity : 0.7920          
##          Pos Pred Value : 0.5938          
##          Neg Pred Value : 0.7615          
##              Prevalence : 0.3557          
##          Detection Rate : 0.1959          
##    Detection Prevalence : 0.3299          
##       Balanced Accuracy : 0.6714          
##                                           
##        'Positive' Class : Starbucks       
##